Goal: Dan suggested skipping the PCA step and just looking for metabolites associated with leaf length via penalized regression.
library(glmnet)
Loading required package: Matrix
Loaded glmnet 4.1
library(relaimpo)
Loading required package: MASS
Loading required package: boot
Loading required package: survey
Loading required package: grid
Loading required package: survival
Attaching package: ‘survival’
The following object is masked from ‘package:boot’:
aml
Attaching package: ‘survey’
The following object is masked from ‘package:graphics’:
dotchart
Loading required package: mitools
This is the global version of package relaimpo.
If you are a non-US user, a version with the interesting additional metric pmvd is available
from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ────────────────────────────────────────────── tidyverse 1.3.0 ──
✓ ggplot2 3.3.3 ✓ purrr 0.3.4
✓ tibble 3.0.6 ✓ dplyr 1.0.4
✓ tidyr 1.1.2 ✓ stringr 1.4.0
✓ readr 1.4.0 ✓ forcats 0.5.1
── Conflicts ───────────────────────────────────────────────── tidyverse_conflicts() ──
x tidyr::expand() masks Matrix::expand()
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
x tidyr::pack() masks Matrix::pack()
x dplyr::select() masks MASS::select()
x tidyr::unpack() masks Matrix::unpack()
library(broom)
get leaflength data
leaflength <- read_csv("../../plant/output/leaf_lengths_metabolite.csv") %>%
mutate(pot=str_pad(pot, width=3, pad="0"),
sampleID=str_c("wyo", genotype, pot, sep="_")) %>%
select(sampleID, genotype, trt, leaf_avg_std)
── Column specification ───────────────────────────────────────────────────────────────
cols(
pot = col_double(),
soil = col_character(),
genotype = col_character(),
trt = col_character(),
leaf_avg = col_double(),
leaf_avg_std = col_double()
)
leaflength %>% arrange(sampleID)
get and wrangle metabolite data
met_raw <-read_csv("../input/metabolites_set1.csv")
── Column specification ───────────────────────────────────────────────────────────────
cols(
.default = col_double(),
tissue = col_character(),
soil = col_character(),
genotype = col_character(),
autoclave = col_character(),
time_point = col_character(),
concatenate = col_character()
)
ℹ Use `spec()` for the full column specifications.
met <- met_raw %>%
mutate(pot=str_pad(pot, width = 3, pad = "0")) %>%
mutate(sampleID=str_c("wyo", genotype, pot, sep="_")) %>%
select(sampleID, genotype, tissue, sample_mass = `sample_mass mg`, !submission_number:concatenate) %>%
pivot_longer(!sampleID:sample_mass, names_to = "metabolite", values_to = "met_amount") %>%
#adjust by sample mass
mutate(met_per_mg=met_amount/sample_mass) %>%
#scale and center
group_by(metabolite, genotype, tissue) %>%
mutate(met_per_mg=scale(met_per_mg),
met_amt=scale(met_amount)
) %>%
pivot_wider(id_cols = sampleID,
names_from = c(tissue, metabolite),
values_from = starts_with("met_"),
names_sep = "_")
met
split this into two data frames, one normalized by tissue amount and one not.
met_per_mg <- met %>% select(sampleID, starts_with("met_per_mg")) %>%
as.data.frame() %>% column_to_rownames("sampleID")
met_amt <- met %>% select(sampleID, starts_with("met_amt")) %>%
as.data.frame() %>% column_to_rownames("sampleID")
get leaf data order to match
leaflength <- leaflength[match(met$sampleID, leaflength$sampleID),]
leaflength
Fit 101 CVs for each of 11 alphas
set.seed(1245)
folds <- tibble(run=1:101) %>%
mutate(folds=map(run, ~ sample(rep(1:6,6))))
system.time (met_per_mg_multiCV <- expand_grid(run=1:100, alpha=round(seq(0,1,.1),1)) %>%
left_join(folds, by="run") %>%
mutate(fit=map2(folds, alpha, ~ cv.glmnet(x=as.matrix(met_per_mg),
y=leaflength$leaf_avg_std,
foldid = .x, alpha=.y
)))
#, lambda=exp(seq(-5,0,length.out = 50)) )))
) #100 seconds
user system elapsed
194.727 32.263 242.800
head(met_per_mg_multiCV)
for each fit, pull out the mean cv error, lambda, min lambda, and 1se lambda
met_per_mg_multiCV <- met_per_mg_multiCV %>%
mutate(cvm=map(fit, magrittr::extract("cvm")),
lambda=map(fit, magrittr::extract("lambda")),
lambda.min=map_dbl(fit, magrittr::extract("lambda.min" )),
lambda.1se=map_dbl(fit, magrittr::extract("lambda.1se")),
nzero=map(fit, magrittr::extract("nzero"))
)
head(met_per_mg_multiCV)
now calculate the mean and sem of cvm and min,1se labmdas. These need to be done separately because of the way the grouping works
met_per_mg_summary_cvm <- met_per_mg_multiCV %>% dplyr::select(-fit, -folds) %>%
unnest(c(cvm, lambda)) %>%
group_by(alpha, lambda) %>%
summarize(meancvm=mean(cvm), sem=sd(cvm)/sqrt(n()), high=meancvm+sem, low=meancvm-sem)
`summarise()` has grouped output by 'alpha'. You can override using the `.groups` argument.
met_per_mg_summary_cvm
met_per_mg_summary_lambda <- met_per_mg_multiCV %>% dplyr::select(-fit, -folds, -cvm) %>%
group_by(alpha) %>%
summarize(
lambda.min.sd=sd(lambda.min),
lambda.min.mean=mean(lambda.min),
#lambda.min.med=median(lambda.min),
lambda.min.high=lambda.min.mean+lambda.min.sd,
#lambda.min.low=lambda.min.mean-lambda.min.sem,
#lambda.1se.sem=sd(lambda.1se)/sqrt(n()),
lambda.1se.mean=mean(lambda.1se),
#lambda.1se.med=median(lambda.1se),
#lambda.1se.high=lambda.1se+lambda.1se.sem,
#lambda.1se.low=lambda.1se-lambda.1se.sem,
nzero=nzero[1],
lambda=lambda[1]
)
met_per_mg_summary_lambda
plot it
met_per_mg_summary_cvm %>%
#filter(alpha!=0) %>% # worse than everything else and throwing the plots off
ggplot(aes(x=log(lambda), y= meancvm, ymin=low, ymax=high)) +
geom_ribbon(alpha=.25) +
geom_line(aes(color=as.character(alpha))) +
facet_wrap(~ as.character(alpha)) +
coord_cartesian(xlim=(c(-5,0))) +
geom_vline(aes(xintercept=log(lambda.min.mean)), alpha=.5, data=met_per_mg_summary_lambda) +
geom_vline(aes(xintercept=log(lambda.min.high)), alpha=.5, data=met_per_mg_summary_lambda, color="blue")
Make a plot of MSE at minimum lambda for each alpha
met_per_mg_summary_cvm %>%
group_by(alpha) %>%
filter(rank(meancvm, ties.method = "first")==1) %>%
ggplot(aes(x=alpha,y=meancvm,ymin=low,ymax=high)) +
geom_ribbon(color=NA, fill="gray80") +
geom_line() +
geom_point()
Plot the number of nzero coefficients
met_per_mg_summary_lambda %>%
unnest(c(lambda, nzero)) %>%
group_by(alpha) %>%
filter(abs(lambda.min.mean-lambda)==min(abs(lambda.min.mean-lambda)) ) %>%
ungroup() %>%
ggplot(aes(x=as.character(alpha), y=nzero)) +
geom_point() +
ggtitle("Number of non-zero coefficents at minimum lambda") +
ylim(0,36)
OK let’s do repeated test train starting from these CV lambdas
multi_tt <- function(lambda, alpha, n=10000, sample_size=36, train_size=30, x, y=leaflength$leaf_avg_std) {
print(lambda)
print(alpha)
tt <-
tibble(run=1:n) %>%
mutate(train=map(run, ~ sample(1:sample_size, train_size))) %>%
mutate(fit=map(train, ~ glmnet(x=x[.,], y=y[.], lambda = lambda, alpha = alpha ))) %>%
mutate(pred=map2(fit, train, ~ predict(.x, newx = x[-.y,]))) %>%
mutate(cor=map2_dbl(pred, train, ~ cor(.x, y[-.y]) )) %>%
mutate(MSE=map2_dbl(pred, train, ~ mean((y[-.y] - .x)^2))) %>%
summarize(
num_na=sum(is.na(cor)),
num_lt_0=sum(cor<=0, na.rm=TRUE),
avg_cor=mean(cor, na.rm=TRUE),
avg_MSE=mean(MSE))
tt
}
per_mg_fit_test_train <- met_per_mg_summary_lambda %>%
select(alpha, lambda.min.mean)
per_mg_fit_test_train <- met_per_mg_multiCV %>%
filter(run==1) %>%
select(alpha, fit) %>%
right_join(per_mg_fit_test_train)
Joining, by = "alpha"
per_mg_fit_test_train <- per_mg_fit_test_train %>%
mutate(pred_full=map2(fit, lambda.min.mean, ~ predict(.x, s=.y, newx=as.matrix(met_per_mg))),
full_R=map_dbl(pred_full, ~ cor(.x, leaflength$leaf_avg_std)),
full_MSE=map_dbl(pred_full, ~ mean((leaflength$leaf_avg_std-.x)^2))) %>%
mutate(tt=map2(lambda.min.mean, alpha, ~ multi_tt(lambda=.x, alpha=.y, x=as.matrix(met_per_mg))))
[1] 53.31973
[1] 0
[1] 1.526197
[1] 0.1
[1] 1.083632
[1] 0.2
[1] 0.8767108
[1] 0.3
[1] 0.7482567
[1] 0.4
[1] 0.6440281
[1] 0.5
[1] 0.560638
[1] 0.6
[1] 0.5124209
[1] 0.7
[1] 0.4670289
[1] 0.8
[1] 0.4206258
[1] 0.9
[1] 0.3858907
[1] 1
(per_mg_fit_test_train <- per_mg_fit_test_train %>% unnest(tt))
per_mg_fit_test_train %>%
ggplot(aes(x=alpha)) +
geom_line(aes(y=avg_cor), color="red") +
geom_point(aes(y=avg_cor), color="red") +
geom_line(aes(y=avg_MSE), color="blue") +
geom_point(aes(y=avg_MSE), color="blue")
alpha_per_mg <- .5
best_per_mg <- per_mg_fit_test_train %>% filter(alpha == alpha_per_mg)
best_per_mg_fit <- best_per_mg$fit[[1]]
best_per_mg_lambda <- best_per_mg$lambda.min.mean
per_mg_coef.tb <- coef(best_per_mg_fit, s=best_per_mg_lambda) %>%
as.matrix() %>% as.data.frame() %>%
rownames_to_column(var="PC") %>%
rename(beta=`1`)
per_mg_coef.tb %>% filter(beta!=0) %>% arrange(beta)
NA
STOPPED HERE
pred and obs
plot(leaflength$leaf_avg_std, best_per_mg$pred_full[[1]])
cor.test(leaflength$leaf_avg_std, best_per_mg$pred_full[[1]]) #.57
Pearson's product-moment correlation
data: leaflength$leaf_avg_std and best_per_mg$pred_full[[1]]
t = 7.6535, df = 34, p-value = 6.765e-09
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.6320984 0.8911068
sample estimates:
cor
0.7954463
best_per_mg$full_MSE
[1] 0.6345576
per_mg_vars <- per_mg_coef.tb %>%
filter(beta !=0, PC!="(Intercept)") %>%
pull(PC) %>% c("leaf_avg_std", .)
per_mg_relimp <- leaflength %>% select(leaf_avg_std) %>% cbind(met_per_mg.PCs) %>% as.data.frame() %>% dplyr::select(all_of(per_mg_vars)) %>%
calc.relimp()
per_mg_coef.tb <- per_mg_relimp@lmg %>% as.matrix() %>% as.data.frame() %>%
rownames_to_column("PC") %>%
rename(PropVar_met_per_mg=V1) %>%
full_join(per_mg_coef.tb) %>%
arrange(desc(PropVar_met_per_mg))
per_mg_coef.tb
lm with gt and trt
lmtest <- met_per_mg.leaf_PCA$x %>%
as.data.frame() %>%
rownames_to_column("sampleID") %>%
left_join(leaflength) %>%
select(sampleID, genotype, trt, starts_with("PC")) %>%
mutate(trt=ifelse(str_detect(trt, "dead|BLANK"), "deadBLANK", trt)) %>%
pivot_longer(cols=starts_with("PC"), names_to = "PC") %>%
mutate(PC=str_c("leaf_", PC)) %>%
group_by(PC) %>%
nest() %>%
mutate(lm_add=map(data, ~ lm(value ~ genotype + trt, data=.)),
lm_int=map(data, ~ lm(value ~ genotype*trt, data=.)))
lmtest %>% mutate(broomtidy = map(lm_add, tidy)) %>%
unnest(broomtidy) %>%
select(PC, term, p.value) %>%
filter(! str_detect(term, "Intercept"),
p.value < 0.1) %>%
arrange(term, p.value)
lmtest %>% mutate(broomtidy = map(lm_int, broom::tidy)) %>%
unnest(broomtidy) %>%
select(PC, term, p.value) %>%
filter(! str_detect(term, "Intercept"),
p.value < 0.1) %>%
arrange(term, p.value)
lmtest <- met_per_mg.root_PCA$x %>%
as.data.frame() %>%
rownames_to_column("sampleID") %>%
left_join(leaflength) %>%
select(sampleID, genotype, trt, starts_with("PC")) %>%
mutate(trt=ifelse(str_detect(trt, "dead|BLANK"), "deadBLANK", trt)) %>%
pivot_longer(cols=starts_with("PC"), names_to = "PC") %>%
mutate(PC=str_c("root_", PC)) %>%
group_by(PC) %>%
nest() %>%
mutate(lm_add=map(data, ~ lm(value ~ genotype + trt, data=.)),
lm_int=map(data, ~ lm(value ~ genotype*trt, data=.)))
lmtest %>% mutate(broomtidy = map(lm_add, tidy)) %>%
unnest(broomtidy) %>%
select(PC, term, p.value) %>%
filter(! str_detect(term, "Intercept"),
p.value < 0.1) %>%
arrange(term, p.value)
lmtest %>% mutate(broomtidy = map(lm_int, broom::tidy)) %>%
unnest(broomtidy) %>%
select(PC, term, p.value) %>%
filter(! str_detect(term, "Intercept"),
p.value < 0.1) %>%
arrange(term, p.value)
Checkout the rotations.
met_per_mg_rotation_out <- met_per_mg.PC_rotation %>%
pivot_longer(-metabolite, names_to="PC", values_to="loading") %>%
filter(PC %in% filter(per_mg_coef.tb, beta!=0)$PC ) %>%
group_by(PC) %>%
filter(!str_detect(metabolite,".*(leaf|root)_[0-9]*$")) %>%
filter(abs(loading) >= 0.05) %>%
left_join(per_mg_coef.tb, by="PC") %>%
arrange(desc(abs(beta)), desc(abs(loading))) %>%
mutate(organ=ifelse(str_detect(metabolite, "_leaf_"), "leaf", "root"),
transformation="normalized",
metabolite=str_remove(metabolite, "met_per_mg_(root|leaf)_"),
metabolite_effect_on_leaf=ifelse(beta*loading>0, "increase", "decrease"))
met_per_mg_rotation_out %>% write_csv("../output/Leaf_associated_metabolites_normalized.csv")
met_per_mg_rotation_out
Fit 101 CVs for each of 11 alphas
set.seed(1245)
folds <- tibble(run=1:101) %>%
mutate(folds=map(run, ~ sample(rep(1:6,6))))
system.time (met_amt_multiCV <- expand_grid(run=1:100, alpha=round(seq(0,1,.1),1)) %>%
left_join(folds, by="run") %>%
mutate(fit=map2(folds, alpha, ~ cv.glmnet(x=met_amt.PCs, y=leaflength$leaf_avg_std, foldid = .x, alpha=.y
)))
#, lambda=exp(seq(-5,0,length.out = 50)) )))
) #100 seconds
head(met_amt_multiCV)
for each fit, pull out the mean cv error, lambda, min lambda, and 1se lambda
met_amt_multiCV <- met_amt_multiCV %>%
mutate(cvm=map(fit, magrittr::extract("cvm")),
lambda=map(fit, magrittr::extract("lambda")),
lambda.min=map_dbl(fit, magrittr::extract("lambda.min" )),
lambda.1se=map_dbl(fit, magrittr::extract("lambda.1se")),
nzero=map(fit, magrittr::extract("nzero"))
)
head(met_amt_multiCV)
now calculate the mean and sem of cvm and min,1se labmdas. These need to be done separately because of the way the grouping works
met_amt_summary_cvm <- met_amt_multiCV %>% dplyr::select(-fit, -folds) %>%
unnest(c(cvm, lambda)) %>%
group_by(alpha, lambda) %>%
summarize(meancvm=mean(cvm), sem=sd(cvm)/sqrt(n()), high=meancvm+sem, low=meancvm-sem)
met_amt_summary_cvm
met_amt_summary_lambda <- met_amt_multiCV %>% dplyr::select(-fit, -folds, -cvm) %>%
group_by(alpha) %>%
summarize(
lambda.min.sd=sd(lambda.min),
lambda.min.mean=mean(lambda.min),
#lambda.min.med=median(lambda.min),
lambda.min.high=lambda.min.mean+lambda.min.sd,
#lambda.min.low=lambda.min.mean-lambda.min.sem,
#lambda.1se.sem=sd(lambda.1se)/sqrt(n()),
lambda.1se.mean=mean(lambda.1se),
#lambda.1se.med=median(lambda.1se),
#lambda.1se.high=lambda.1se+lambda.1se.sem,
#lambda.1se.low=lambda.1se-lambda.1se.sem,
nzero=nzero[1],
lambda=lambda[1]
)
met_amt_summary_lambda
plot it
met_amt_summary_cvm %>%
#filter(alpha!=0) %>% # worse than everything else and throwing the plots off
ggplot(aes(x=log(lambda), y= meancvm, ymin=low, ymax=high)) +
geom_ribbon(alpha=.25) +
geom_line(aes(color=as.character(alpha))) +
facet_wrap(~ as.character(alpha)) +
coord_cartesian(xlim=(c(-5,0))) +
geom_vline(aes(xintercept=log(lambda.min.mean)), alpha=.5, data=met_amt_summary_lambda) +
geom_vline(aes(xintercept=log(lambda.min.high)), alpha=.5, data=met_amt_summary_lambda, color="blue")
Make a plot of MSE at minimum lambda for each alpha
met_amt_summary_cvm %>%
group_by(alpha) %>%
filter(rank(meancvm, ties.method = "first")==1) %>%
ggplot(aes(x=alpha,y=meancvm,ymin=low,ymax=high)) +
geom_ribbon(color=NA, fill="gray80") +
geom_line() +
geom_point()
not a particular large difference here after 0.2
Plot the number of nzero coefficients
met_amt_summary_lambda %>%
unnest(c(lambda, nzero)) %>%
group_by(alpha) %>%
filter(abs(lambda.min.mean-lambda)==min(abs(lambda.min.mean-lambda)) ) %>%
ungroup() %>%
ggplot(aes(x=as.character(alpha), y=nzero)) +
geom_point() +
ggtitle("Number of non-zero coefficents at minimum lambda") +
ylim(0,36)
OK let’s do repeated test train starting from these CV lambdas
multi_tt <- function(lambda, alpha, n=10000, sample_size=36, train_size=30, x, y=leaflength$leaf_avg_std) {
print(lambda)
print(alpha)
tt <-
tibble(run=1:n) %>%
mutate(train=map(run, ~ sample(1:sample_size, train_size))) %>%
mutate(fit=map(train, ~ glmnet(x=x[.,], y=y[.], lambda = lambda, alpha = alpha ))) %>%
mutate(pred=map2(fit, train, ~ predict(.x, newx = x[-.y,]))) %>%
mutate(cor=map2_dbl(pred, train, ~ cor(.x, y[-.y]) )) %>%
mutate(MSE=map2_dbl(pred, train, ~ mean((y[-.y] - .x)^2))) %>%
summarize(
num_na=sum(is.na(cor)),
num_lt_0=sum(cor<=0, na.rm=TRUE),
avg_cor=mean(cor, na.rm=TRUE),
avg_MSE=mean(MSE))
tt
}
amt_fit_test_train <- met_amt_summary_lambda %>%
select(alpha, lambda.min.mean)
amt_fit_test_train <- met_amt_multiCV %>%
filter(run==1) %>%
select(alpha, fit) %>%
right_join(amt_fit_test_train)
amt_fit_test_train <- amt_fit_test_train %>%
mutate(pred_full=map2(fit, lambda.min.mean, ~ predict(.x, s=.y, newx=met_amt.PCs)),
full_R=map_dbl(pred_full, ~ cor(.x, leaflength$leaf_avg_std)),
full_MSE=map_dbl(pred_full, ~ mean((leaflength$leaf_avg_std-.x)^2))) %>%
mutate(tt=map2(lambda.min.mean, alpha, ~ multi_tt(lambda=.x, alpha=.y, x=met_amt.PCs)))
(amt_fit_test_train <- amt_fit_test_train %>% unnest(tt))
amt_fit_test_train %>%
ggplot(aes(x=alpha)) +
geom_line(aes(y=avg_cor), color="red") +
geom_point(aes(y=avg_cor), color="red") +
geom_line(aes(y=avg_MSE), color="blue") +
geom_point(aes(y=avg_MSE), color="blue")
alpha of 0.8 to 1.0 are very similar and are the best here.
alpha_amt <- .8
best_amt <- amt_fit_test_train %>% filter(alpha == alpha_amt)
best_amt_fit <- best_amt$fit[[1]]
best_amt_lambda <- best_amt$lambda.min.mean
amt_coef.tb <- coef(best_amt_fit, s=best_amt_lambda) %>%
as.matrix() %>% as.data.frame() %>%
rownames_to_column(var="PC") %>%
rename(beta=`1`)
amt_coef.tb %>% filter(beta!=0) %>% arrange(beta)
pred and obs
plot(leaflength$leaf_avg_std, best_amt$pred_full[[1]])
cor.test(leaflength$leaf_avg_std, best_amt$pred_full[[1]]) #.736
best_amt$full_MSE
amt_vars <- amt_coef.tb %>%
filter(beta !=0, PC!="(Intercept)") %>%
pull(PC) %>% c("leaf_avg_std", .)
amt_relimp <- leaflength %>% select(leaf_avg_std) %>% cbind(met_amt.PCs) %>% as.data.frame() %>% dplyr::select(all_of(amt_vars)) %>%
calc.relimp()
amt_coef.tb <- amt_relimp@lmg %>% as.matrix() %>% as.data.frame() %>%
rownames_to_column("PC") %>%
rename(PropVar_met_amt=V1) %>%
full_join(amt_coef.tb) %>%
arrange(desc(PropVar_met_amt))
amt_coef.tb
lm with gt and trt
lmtest <- met_amt.leaf_PCA$x %>%
as.data.frame() %>%
rownames_to_column("sampleID") %>%
left_join(leaflength) %>%
select(sampleID, genotype, trt, starts_with("PC")) %>%
mutate(trt=ifelse(str_detect(trt, "dead|BLANK"), "deadBLANK", trt)) %>%
pivot_longer(cols=starts_with("PC"), names_to = "PC") %>%
mutate(PC=str_c("leaf_", PC)) %>%
group_by(PC) %>%
nest() %>%
mutate(lm_add=map(data, ~ lm(value ~ genotype + trt, data=.)),
lm_int=map(data, ~ lm(value ~ genotype*trt, data=.)))
lmtest %>% mutate(broomtidy = map(lm_add, tidy)) %>%
unnest(broomtidy) %>%
select(PC, term, p.value) %>%
filter(! str_detect(term, "Intercept"),
p.value < 0.1) %>%
arrange(term, p.value)
lmtest %>% mutate(broomtidy = map(lm_int, broom::tidy)) %>%
unnest(broomtidy) %>%
select(PC, term, p.value) %>%
filter(! str_detect(term, "Intercept"),
p.value < 0.1) %>%
arrange(term, p.value)
lmtest <- met_amt.root_PCA$x %>%
as.data.frame() %>%
rownames_to_column("sampleID") %>%
left_join(leaflength) %>%
select(sampleID, genotype, trt, starts_with("PC")) %>%
mutate(trt=ifelse(str_detect(trt, "dead|BLANK"), "deadBLANK", trt)) %>%
pivot_longer(cols=starts_with("PC"), names_to = "PC") %>%
mutate(PC=str_c("root_", PC)) %>%
group_by(PC) %>%
nest() %>%
mutate(lm_add=map(data, ~ lm(value ~ genotype + trt, data=.)),
lm_int=map(data, ~ lm(value ~ genotype*trt, data=.)))
lmtest %>% mutate(broomtidy = map(lm_add, tidy)) %>%
unnest(broomtidy) %>%
select(PC, term, p.value) %>%
filter(! str_detect(term, "Intercept"),
p.value < 0.1) %>%
arrange(term, p.value)
lmtest %>% mutate(broomtidy = map(lm_int, broom::tidy)) %>%
unnest(broomtidy) %>%
select(PC, term, p.value) %>%
filter(! str_detect(term, "Intercept"),
p.value < 0.1) %>%
arrange(term, p.value)
Checkout the rotations.
met_amt_rotation_out <- met_amt.PC_rotation %>%
pivot_longer(-metabolite, names_to="PC", values_to="loading") %>%
filter(PC %in% filter(amt_coef.tb, beta!=0)$PC ) %>%
group_by(PC) %>%
filter(!str_detect(metabolite,".*(leaf|root)_[0-9]*$")) %>%
filter(abs(loading) >= 0.05) %>%
left_join(amt_coef.tb, by="PC") %>%
arrange(desc(abs(beta)), desc(abs(loading))) %>%
mutate(organ=ifelse(str_detect(metabolite, "_leaf_"), "leaf", "root"),
transformation="raw",
metabolite=str_remove(metabolite, "met_amt_(root|leaf)_"),
metabolite_effect_on_leaf=ifelse(beta*loading>0, "increase", "decrease"))
met_amt_rotation_out %>% write_csv("../output/Leaf_associated_metabolites_raw.csv")
met_amt_rotation_out